Mapping of Ki67 interactions with the genome and comparison with lamina interactions.
Various analyses of wildtype cells.
NA
Set the parameters and list the data.
# Load dependencies - without warnings / messages
library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
library(ggplot2)
library(RColorBrewer)
library(GGally)
library(corrr)
# Prepare output
output_dir <- "ts210520_ki67_wildtype_cells"
dir.create(output_dir, showWarnings = FALSE)
# Load input
input_dir <- "ts210413_data_gathering"
bin_size <- readRDS(file.path(input_dir, "bin_size.rds"))
centromeres <- readRDS(file.path(input_dir, "centromeres.rds"))
colors_set1 <- readRDS(file.path(input_dir, "colors_set1.rds"))
colors_set2 <- readRDS(file.path(input_dir, "colors_set2.rds"))
colors_set3 <- readRDS(file.path(input_dir, "colors_set3.rds"))
tib_padamid_replicates <- readRDS(file.path(input_dir,
"tib_padamid_replicates.rds"))
tib_padamid_combined <- readRDS(file.path(input_dir,
"tib_padamid_combined.rds"))
gr_padamid_replicates <- readRDS(file.path(input_dir,
"gr_padamid_replicates.rds"))
gr_padamid_combined <- readRDS(file.path(input_dir,
"gr_padamid_combined.rds"))
tib_hmm_replicates <- readRDS(file.path(input_dir, "tib_hmm_replicates.rds"))
tib_hmm_combined <- readRDS(file.path(input_dir, "tib_hmm_combined.rds"))
padamid_metadata_replicates <- readRDS(file.path(input_dir,
"padamid_metadata_replicates.rds"))
padamid_metadata_combined <- readRDS(file.path(input_dir,
"padamid_metadata_combined.rds"))library(knitr)
opts_chunk$set(fig.width = 10, fig.height = 4, cache = T,
dev=c('png', 'pdf'), fig.path = file.path(output_dir, "figures/"))
pdf.options(useDingbats = FALSE)# From Fede:
# ggpairs custom functions
corColor <- function(data, mapping, color = I("black"), sizeRange = c(1, 3), ...) {
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
r <- cor(x, y)
rt <- format(r, digits = 3)
tt <- as.character(rt)
cex <- max(sizeRange)
# helper function to calculate a useable size
percent_of_range <- function(percent, range) {
percent * diff(range) + min(range, na.rm = TRUE)
}
# plot correlation coefficient
p <- ggally_text(label = tt, mapping = aes(), xP = 0.5, yP = 0.5,
# size = I(percent_of_range(cex * abs(r), sizeRange)),
size = 6,
color = color, ...) +
theme(panel.grid.minor=element_blank(),
panel.grid.major=element_blank())
corColors <- RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")[2:6]
if (r <= boundaries[1]) {
corCol <- corColors[1]
} else if (r <= boundaries[2]) {
corCol <- corColors[2]
} else if (r < boundaries[3]) {
corCol <- corColors[3]
} else if (r < boundaries[4]) {
corCol <- corColors[4]
} else {
corCol <- corColors[5]
}
p <- p +
theme(panel.background = element_rect(fill = corCol))
return(p)
}
customScatter <- function (data, mapping)
{
p <- ggplot(data = data, mapping) +
geom_bin2d(bins = 100) +
geom_smooth(method = "lm", se = T, col = "red") +
scale_fill_gradient(low = "lightgrey", high = "black", name = "Count") +
theme_bw()
p
}
CorrelationPerChromosome <- function(tib, ki67, h3k27me3, h3k9me3, lmnb1,
plot_lmnb1 = F) {
# How does DMSO correlates with histone modifications?
tib <- tib %>%
dplyr::select(seqnames, matches(ki67),
matches(h3k27me3), matches(h3k9me3),
matches(lmnb1)) %>%
dplyr::rename_at(2:5, ~ c("ki67", "h3k27me3", "h3k9me3", "lmnb1")) %>%
drop_na() %>%
filter(seqnames != "chrY") %>%
group_by(seqnames) %>%
dplyr::summarise(h3k27me3 = cor(ki67, h3k27me3, method = "pearson"),
h3k9me3 = cor(ki67, h3k9me3, method = "pearson"),
lmnb1 = cor(ki67, lmnb1, method = "pearson")) %>%
ungroup() %>%
mutate(size = seqlengths(gr_padamid_replicates)[match(seqnames,
seqlevels(gr_padamid_replicates))]) %>%
arrange(-size) %>%
mutate(seqnames = factor(seqnames, levels = unique(seqnames))) %>%
gather(key, value, contains("h3k"), lmnb1) %>%
mutate(key = factor(key, c("h3k27me3", "h3k9me3", "lmnb1")))
# Remove lmnb1
if (plot_lmnb1 == F) {
tib <- tib %>%
filter(key != "lmnb1")
}
# Plot
plt <- tib %>%
ggplot(aes(x = seqnames, y = value, col = key)) +
geom_point() +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
xlab("") +
ylab("Pearson") +
ggtitle(ki67) +
scale_color_manual(values = RColorBrewer::brewer.pal(4, "Set1")[c(3, 4, 2)]) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)
invisible(tib)
}
PlotScatter <- function(tib, n1, n2, color_by = NULL, identity = F,
n = 1e99, alpha = 0.02) {
# Get tibble
tib <- tib %>%
dplyr::select(seqnames, one_of(n1), one_of(n2)) %>%
rename_all(~ c("seqnames", "n1", "n2"))
# Prepare color
if (! is.null(color_by)) {
tib <- tib %>%
add_column(color = color_by) %>%
drop_na()
alpha = 1
limits_color <- quantile(tib$color, c(0.001, 0.999), na.rm = T)
tib$color[tib$color < limits_color[1]] <- limits_color[1]
tib$color[tib$color > limits_color[2]] <- limits_color[2]
} else {
tib <- tib %>% drop_na()
tib$color = "1"
#alpha = alpha
}
tib <- tib %>%
arrange(sample(1:nrow(.), size = nrow(.), replace = F))
# Plot
xlimits <- quantile(tib$n1, c(0.001, 0.999), na.rm = T) * 1.4
ylimits <- quantile(tib$n2, c(0.001, 0.999), na.rm = T) * 1.4
n <- min(nrow(tib), n)
plt <- tib[1:n, ] %>%
ggplot(aes(x = n1, y = n2, color = color)) +
geom_point(size = 0.5, alpha = alpha) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
geom_vline(xintercept = 0, col = "black", linetype = "dashed") +
xlab(n1) +
ylab(n2) +
ggtitle(paste0("Spearman: ",
round(cor(tib$n1, tib$n2, use = "complete.obs",
method = "spearman"), 2))) +
coord_cartesian(xlim = xlimits, ylim = ylimits) +
theme_bw() +
theme(aspect.ratio = 1)
# Prepare color
if (! is.null(color_by)) {
plt <- plt +
scale_color_gradient2(low = "blue", mid = "grey", high = "red",
midpoint = 0)
} else {
plt <- plt +
scale_color_manual(values = "black", guide = F)
}
if (identity) plt <- plt + geom_abline(slope = 1, intercept = 0, col = "red", linetype = "dashed")
plot(plt)
}
PlotScatterBinned <- function(tib, n1, n2, color_by = NULL, identity = F,
n_min = 10, ylimits_col = c(-2.4, 2.4)) {
# Get tibble
tib_process <- tib %>%
dplyr::select(seqnames, all_of(n1), all_of(n2)) %>%
rename_all(~ c("seqnames", "n1", "n2"))
if (! is.null(color_by)) {
tib_process <- tib_process %>%
add_column(color = color_by)
}
tib_process <- tib_process %>%
drop_na()
# Change color range
if (! is.null(color_by)) {
limits_color <- quantile(tib_process$color, c(0.001, 0.999), na.rm = T)
tib_process$color[tib_process$color < limits_color[1]] <- limits_color[1]
tib_process$color[tib_process$color > limits_color[2]] <- limits_color[2]
}
# Metrics
n1_min = min(tib_process$n1) - 0.001
n1_max = max(tib_process$n1) + 0.001
n1_binsize <- (n1_max - n1_min) / 49
n2_min = min(tib_process$n2) - 0.001
n2_max = max(tib_process$n2) + 0.001
n2_binsize <- (n2_max - n2_min) / 49
tib_summary <- tib_process %>%
mutate(n1_cut = cut(n1,
seq(n1_min, n1_max, length.out = 50)),
n2_cut = cut(n2,
seq(n2_min, n2_max, length.out = 50))) %>%
mutate(n1_bin = as.numeric(as.factor(n1_cut)),
n2_bin = as.numeric(as.factor(n2_cut))) %>%
mutate(n1_bin = n1_min - n1_binsize/2 + n1_bin * n1_binsize,
n2_bin = n2_min - n2_binsize/2 + n2_bin * n2_binsize) %>%
group_by(n1_bin, n2_bin)
# Plot
if (! is.null(color_by)) {
tib_summary <- tib_summary %>%
dplyr::summarise(n = n(),
mark = mean(color)) %>%
ungroup() %>%
filter(n >= n_min)
plt <- tib_summary %>%
ggplot(aes(x = n1_bin, y = n2_bin)) +
geom_tile(aes(fill = mark)) +
scale_fill_gradient2(low = "blue", mid = "grey", high = "red",
midpoint = 0, limits = ylimits_col,
na.value = "green")
} else {
tib_summary <- tib_summary %>%
dplyr::summarise(n = n()) %>%
ungroup() %>%
filter(n >= n_min)
plt <- tib_summary %>%
ggplot(aes(x = n1_bin, y = n2_bin)) +
geom_tile(aes(fill = n)) +
scale_fill_gradient(low = "lightgrey", high = "black", name = "Count",
limits = c(0, 400), na.value = "green")
}
plt <- plt +
xlab(n1) +
ylab(n2) +
ggtitle(paste0("Pearson: ",
round(cor(tib_process$n1, tib_process$n2, use = "complete.obs",
method = "pearson"), 2))) +
theme_bw() +
theme(aspect.ratio = 1)
plot(plt)
}
quantiles <- function(x) {
# Use quantiles as boxplot boundaries
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}First, I want to confirm the enrichment at and near rDNA sequences. I will start by loading the enrichment at rDNA sequences from the pipeline.
# Prepare metadata
padamid_metadata_wildtype <- padamid_metadata_replicates %>%
filter(experiment == "wildtype" &
target == "Ki67")
# Load rDNA counts
rdna_counts <- tibble()
for (s in padamid_metadata_wildtype$sample) {
tmp <- read_tsv(str_interp("results/reports/experiment/${s}/${s}_rDNAEnrichment.txt"))
rdna_counts <- bind_rows(rdna_counts, tmp)
}
# Calculate enrichment over Dam
rdna_enrich <- rdna_counts %>%
dplyr::select(names, fraction) %>%
mutate(target = str_remove(names, ".*_"),
exp = str_remove(names, "_[^_]+$")) %>%
dplyr::select(-names) %>%
spread(target, fraction) %>%
mutate(ratio = log2(Ki67 / Dam),
cell = str_remove(exp, "_.*"),
cell = factor(cell, levels(padamid_metadata_wildtype$cell)))
# Plot enrichment
rdna_enrich %>%
ggplot(aes(x = cell, y = ratio, col = cell)) +
geom_point(size = 3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
xlab("") +
ylab("rDNA enrichment (log2)") +
scale_color_manual(values = colors_set2) +
theme_bw() +
theme(aspect.ratio = 1)I want to repeat this for centromeres, as this is almost the same.
# Load centr counts
centr_counts <- tibble()
for (s in padamid_metadata_wildtype$sample) {
tmp <- read_tsv(str_interp("results/reports/experiment/${s}/${s}_CentromereEnrichment.txt"))
centr_counts <- bind_rows(centr_counts, tmp)
}
# Calculate enrichment over Dam
centr_enrich <- centr_counts %>%
dplyr::select(names, centromere_extended.fraction) %>%
mutate(target = str_remove(names, ".*_"),
exp = str_remove(names, "_[^_]+$")) %>%
dplyr::select(-names) %>%
spread(target, centromere_extended.fraction) %>%
mutate(ratio = log2(Ki67 / Dam),
cell = str_remove(exp, "_.*"),
cell = factor(cell, levels(padamid_metadata_wildtype$cell)))
# Plot enrichment
centr_enrich %>%
ggplot(aes(x = cell, y = ratio, col = cell)) +
geom_point(size = 3) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
xlab("") +
ylab("centromere enrichment (log2)") +
scale_color_manual(values = colors_set2) +
theme_bw() +
theme(aspect.ratio = 1)These analyses are based on simple counts. I want to do a similar analysis based on the normalized scores themselves.
# Do this for the combined data
padamid_metadata_combined_wildtype <- padamid_metadata_combined %>%
# Problem: no H3K9me3 data in HCT116 cells - use DMSO condition instead
mutate(experiment = case_when(sample == "HCT116_ActD_DMSO_H3K9me3" ~ "wildtype",
T ~ as.character(experiment)),
experiment = factor(experiment,
levels(padamid_metadata_combined$experiment))) %>%
filter(experiment == "wildtype")
# Get similar chromosomes for the two objects and filter samples
chromosomes <- c(paste0("chr", 1:22), "chrX")
tib_padamid_wildtype <- tib_padamid_combined %>%
dplyr::select(seqnames, start, end,
one_of(padamid_metadata_combined_wildtype$sample)) %>%
filter(seqnames %in% chromosomes)
centromeres <- centromeres[seqnames(centromeres) %in% chromosomes]
# Add distance to centromeres to the GRanges object - in Mb
dis <- distanceToNearest(as(tib_padamid_wildtype, "GRanges"), centromeres)
tib_padamid_wildtype$distance_to_centromere <- mcols(dis)$distance / 1e6
# # Process
# tib <- tib_padamid_wildtype %>%
# dplyr::select(-start, -end) %>%
# mutate(rdna = as.logical(seqnames %in% paste0("chr", c(13, 14, 15, 21, 22)))) %>%
# gather(key, value, -distance_to_centromere, -rdna, -seqnames) %>%
# separate(key, c("cell", "condition", "target"), remove = F) %>%
# mutate(cell = factor(cell, levels = levels(padamid_metadata_combined_wildtype$cell)),
# dis_group = as.numeric(cut(distance_to_centromere,
# breaks = seq(0, max(distance_to_centromere) + 1,
# by = 1))) - 1)
#
#
# # Plot as boxplots
# # First, calculate boxplots to have more control over ylimits
# tib_summary <- tib %>%
# group_by(cell, dis_group, rdna) %>%
# drop_na() %>%
# summarise(ymin = quantile(value, 0.05),
# lower = quantile(value, 0.25),
# middle = quantile(value, 0.5),
# upper = quantile(value, 0.75),
# ymax = quantile(value, 0.95))
#
# tib_summary %>%
# filter(dis_group <= 30) %>%
# ggplot(aes(x = dis_group, ymin = ymin, lower = lower, middle = middle,
# upper = upper, ymax = ymax, group = dis_group, y = middle)) +
# geom_boxplot(stat="identity", outlier.shape = NA, fill = "lightgrey", col = "black") +
# geom_line(aes(y = middle, group = NULL), col = "red") +
# geom_hline(yintercept = 0, col = "blue", linetype = "dashed") +
# facet_grid(cell ~ rdna, scales = "free_y") +
# xlab("Distance to centromere (Mb)") +
# ylab("DamID (log2)") +
# #coord_cartesian(xlim = c(0, 30)) +
# theme_bw() +
# theme(aspect.ratio = 1)
#
# # Is the enrichment specific for rDNA chromosomes, or is chromosome size more
# # important?
# tib_summary <- tib %>%
# filter(dis_group < 2) %>%
# group_by(cell, seqnames) %>%
# drop_na() %>%
# summarise(mean = mean(value)) %>%
# ungroup() %>%
# mutate(seqnames = factor(seqnames, chromosomes),
# rdna = seqnames %in% paste0("chr", c(13, 14, 15, 21, 22))) %>%
# mutate(size = seqlengths(gr_padamid_replicates)[match(seqnames,
# seqlevels(gr_padamid_replicates))],
# size = size / 1e6)
#
# tib_summary %>%
# arrange(-size) %>%
# mutate(seqnames = factor(seqnames, levels = unique(seqnames))) %>%
# ggplot(aes(x = seqnames, y = mean, fill = cell, col = rdna)) +
# geom_bar(stat = "identity") +
# geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
# facet_grid(. ~ cell) +
# xlab("") +
# ylab("Ki67 score near centromeres") +
# scale_color_manual(values = c(NA, "black")) +
# scale_fill_manual(values = colors_set2, guide = F) +
# theme_bw() +
# theme(aspect.ratio = 1,
# axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
#
# # Add chromosome size and make a scatter plot
# tib_summary %>%
# ggplot(aes(x = size, y = mean, col = rdna)) +
# geom_point(size = 2) +
# geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
# xlab("Chromosome size (Mb)") +
# ylab("Ki67 score near centromeres (<2Mb)") +
# scale_color_grey() +
# facet_grid(. ~ cell) +
# coord_cartesian(xlim = c(0, 250)) +
# theme_bw() +
# theme(aspect.ratio = 1)Combined, it seems that Ki67 does not bind at rDNA sequences, but is enriched near centromeres. Scores at rDNA are only significantly higher than other small chromosomes in RPE cells.
Next, I want to compare cell types. I will make this comparison based on HMM calls and individual bins.
I should consider making this comparison for the combined values or individual replicates. For now, let’s start with the combined values.
Not good - do not include. HMM calls are too variable, not robust enough.
Honestly, I think the HMM calls are very poor for the Ki67 tracks. Especially the K562 calls are terrible.
This should be better than the HMM calls. Simply calculating the correlations between cell types.
# Use GGally to make correlation plots
boundaries <- seq(from = 0.1, to = 0.7, length.out = 4)
# Replicates
tib <- tib_padamid_replicates %>%
dplyr::select(padamid_metadata_wildtype$sample) %>%
drop_na()
plt <- ggpairs(tib,
upper = list(continuous = corColor),
lower = list(continuous = customScatter),
diag = list(continuous = function(data, mapping, ...) {
ggally_densityDiag(data = data, mapping = mapping, alpha = 0.3, fill = "red") +
theme_bw()})) +
ggtitle("Correlating all data") +
xlab("") +
ylab("")
print(plt)# Combined - between cell lines
tib <- tib_padamid_combined %>%
dplyr::select(padamid_metadata_combined_wildtype$sample) %>%
dplyr::select(ends_with("Ki67")) %>%
drop_na()
plt <- ggpairs(tib,
upper = list(continuous = corColor),
lower = list(continuous = customScatter),
diag = list(continuous = function(data, mapping, ...) {
ggally_densityDiag(data = data, mapping = mapping, alpha = 0.3, fill = "red") +
theme_bw()})) +
ggtitle("Correlating all data") +
xlab("") +
ylab("")
print(plt)# Combined - between antibodies
tib <- tib_padamid_combined %>%
dplyr::select(padamid_metadata_combined_wildtype$sample) %>%
dplyr::select(contains("Ki67") & contains("HCT116")) %>%
drop_na()
plt <- ggpairs(tib,
upper = list(continuous = corColor),
lower = list(continuous = customScatter),
diag = list(continuous = function(data, mapping, ...) {
ggally_densityDiag(data = data, mapping = mapping, alpha = 0.3, fill = "red") +
theme_bw()})) +
ggtitle("Correlating all data") +
xlab("") +
ylab("")
print(plt)I also want to make these plots for individual chromosomes. Scatterplots are too much, but simple pearson correlations should do the trick.
# Calculate pearson correlations per chromosome
CorrelationsPerChromosome <- function(input_tib, metadata) {
# Gather data and calculate pearson correlations
tib <- input_tib %>%
dplyr::select(seqnames, metadata$sample) %>%
drop_na()
# Calculate mean score per chromosome
tib_summary <- tib %>%
gather(key, value, -seqnames) %>%
mutate(cell = str_remove(key, "_.*"),
cell = factor(cell, levels(metadata$cell)),
seqnames = factor(seqnames, chromosomes),
rdna = seqnames %in% paste0("chr", c(13, 14, 15, 21, 22))) %>%
mutate(size = seqlengths(gr_padamid_replicates)[match(seqnames,
seqlevels(gr_padamid_replicates))],
size = size / 1e6) %>%
drop_na()
plt <- tib_summary %>%
arrange(-size) %>%
mutate(seqnames = factor(seqnames, levels = unique(seqnames))) %>%
ggplot(aes(x = seqnames, y = value, fill = cell, col = rdna)) +
#geom_boxplot(outlier.shape = NA) +
stat_summary(fun.data = quantiles, geom = "boxplot") +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
facet_grid(cell ~ .) +
xlab("") +
ylab("Ki67 score") +
coord_cartesian(ylim = c(-1.3, 1.3)) +
scale_fill_manual(values = colors_set2, guide = F) +
scale_color_manual(values = c("grey60", "grey0")) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)
plt <- tib_summary %>%
group_by(cell, size, seqnames) %>%
summarise(mean = mean(value),
# q75 = quantile(value, 0.75),
# q90 = quantile(value, 0.9),
q95 = quantile(value, 0.95)) %>%
ungroup() %>%
gather(key, value, mean, starts_with("q")) %>%
ggplot(aes(x = size, y = value, col = cell)) +
geom_point() +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
facet_grid(key ~ cell) +
xlab("Chromosome size (Mb)") +
ylab("Mean Ki67 score") +
coord_cartesian(xlim = c(0, 250)) +
scale_color_manual(values = colors_set2, guide = F) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)
tib_cor <- tib %>%
group_by(seqnames) %>%
nest() %>%
mutate(cordf = map(data, correlate)) %>%
unnest(cordf) %>%
dplyr::select(-data) %>%
gather(key, value, -seqnames, -term) %>%
ungroup() %>%
drop_na() %>%
mutate(n1 = str_remove(term, "_Ki67"),
n2 = str_remove(key, "_Ki67"),
cell1 = str_remove(term, "_.*"),
cell2 = str_remove(key, "_.*"),
cell1 = factor(cell1, levels(metadata$cell)),
cell2 = factor(cell2, levels(metadata$cell))) %>%
mutate(size = seqlengths(gr_padamid_replicates)[match(seqnames,
seqlevels(gr_padamid_replicates))],
size = size / 1e6) %>%
drop_na()
# Plot
plt <- tib_cor %>%
arrange(-size) %>%
mutate(seqnames = factor(seqnames, levels = unique(seqnames))) %>%
ggplot(aes(x = seqnames, y = value, col = cell2)) +
geom_point() +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
facet_grid(. ~ cell1) +
xlab("") +
ylab("Pearson correlation") +
coord_cartesian(ylim = c(min(0, min(tib_cor$value)), 1)) +
scale_color_manual(values = colors_set2) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)
plt <- tib_cor %>%
ggplot(aes(x = size, y = value, col = cell2)) +
geom_point() +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
facet_grid(. ~ cell1) +
xlab("Chromosome size (Mb)") +
ylab("Pearson correlation") +
coord_cartesian(xlim = c(0, 250),
ylim = c(min(0, min(tib_cor$value)), 1)) +
scale_color_manual(values = colors_set2) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)
}
# Correlations per chromosome
# CorrelationsPerChromosome(tib_padamid_replicates,
# padamid_metadata_wildtype)
CorrelationsPerChromosome(tib_padamid_combined,
padamid_metadata_combined_wildtype %>%
filter(target == "Ki67")) Are Ki67 interactions enriched near centromeres, and specifically those of rDNA chromosomes?
# Get similar chromosomes for the two objects and filter samples
chromosomes <- unique(tib_padamid_wildtype$seqnames)
tib <- tib_padamid_wildtype
centromeres <- centromeres[seqnames(centromeres) %in% chromosomes]
# Also, prepare telomeres
telomeres <- c(GRanges(seqnames = chromosomes,
ranges = IRanges(start = 1, end = 1)),
GRanges(seqnames = chromosomes,
ranges = IRanges(start = seqlengths(gr_padamid_combined)[chromosomes],
end = seqlengths(gr_padamid_combined)[chromosomes])))
seqinfo(telomeres) <- seqinfo(gr_padamid_combined)
telomeres <- sort(telomeres)
# Add distance to centromeres and telomeres to the GRanges object - in Mb
dis <- distanceToNearest(as(tib, "GRanges"), centromeres)
tib$distance_to_centromere <- mcols(dis)$distance / 1e6
dis <- distanceToNearest(as(tib, "GRanges"), telomeres)
tib$distance_to_telomere <- mcols(dis)$distance / 1e6
# Process
tib <- tib %>%
dplyr::select(seqnames,
contains("distance"),
contains("Ki67"),
-matches("HPA|NOV")) %>%
mutate(rdna = as.logical(seqnames %in% paste0("chr", c(13, 14, 15, 21, 22)))) %>%
gather(key, value, -contains("distance"), -rdna, -seqnames) %>%
separate(key, c("cell", "experiment", "target"), remove = F) %>%
mutate(match = match(key, padamid_metadata_combined$sample),
cell = padamid_metadata_combined$cell[match],
dis_group_centromere = as.numeric(cut(distance_to_centromere,
breaks = seq(0, max(distance_to_centromere) + 1,
by = 0.5)))/2 - 0.5,
dis_group_telomere = as.numeric(cut(distance_to_centromere,
breaks = seq(0, max(distance_to_telomere) + 1,
by = 0.5)))/2 - 0.5,
dis_group_telomere = ifelse(dis_group_telomere < 0, 0, dis_group_telomere)) %>%
dplyr::select(-contains("distance")) %>%
gather(class, class_value, contains("dis_group")) %>%
mutate(class = str_remove(class, "dis_group_")) %>%
drop_na()
# Is the enrichment specific for rDNA chromosomes, or is chromosome size more
# important?
tib_summary <- tib %>%
filter(class_value < 2,
class == "centromere") %>%
group_by(cell, seqnames, class) %>%
drop_na() %>%
summarise(mean = mean(value)) %>%
ungroup() %>%
mutate(seqnames = factor(seqnames, chromosomes),
rdna = seqnames %in% paste0("chr", c(13, 14, 15, 21, 22))) %>%
drop_na() %>%
mutate(size = seqlengths(gr_padamid_replicates)[match(seqnames,
seqlevels(gr_padamid_replicates))],
size = size / 1e6)## `summarise()` has grouped output by 'cell', 'seqnames'. You can override using the `.groups` argument.
tib_summary %>%
arrange(-size) %>%
mutate(seqnames = factor(seqnames, levels = unique(seqnames))) %>%
ggplot(aes(x = size, y = mean, col = rdna, shape = rdna)) +
geom_point(size = 2) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
facet_grid(cell ~ .) +
xlab("") +
ylab("Ki67 score near centromeres") +
scale_color_manual(values = c("black", "red")) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))tib_summary %>%
arrange(-size) %>%
mutate(seqnames = factor(seqnames, levels = unique(seqnames))) %>%
ggplot(aes(x = seqnames, y = mean, col = rdna, shape = rdna)) +
geom_point(size = 2) +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
facet_grid(cell ~ .) +
xlab("") +
ylab("Ki67 score near centromeres") +
scale_color_manual(values = c("black", "red")) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))# Plot as boxplots
tib_summary <- tib %>%
filter(class == "centromere") %>%
group_by(cell, class_value) %>%
drop_na() %>%
summarise(ymin = quantile(value, 0.05),
lower = quantile(value, 0.25),
middle = quantile(value, 0.5),
upper = quantile(value, 0.75),
ymax = quantile(value, 0.95))## `summarise()` has grouped output by 'cell'. You can override using the `.groups` argument.
tib_summary %>%
filter(class_value <= 20) %>%
ggplot(aes(x = class_value, ymin = ymin, lower = lower, middle = middle,
upper = upper, ymax = ymax, group = class_value, y = middle)) +
geom_boxplot(stat="identity", outlier.shape = NA, fill = "lightgrey", col = "black") +
geom_line(aes(y = middle, group = NULL), col = "red") +
geom_hline(yintercept = 0, col = "blue", linetype = "dashed") +
facet_grid(cell ~ .) +
xlab("Distance to centromere (Mb)") +
ylab("Ki67") +
#coord_cartesian(xlim = c(0, 30)) +
theme_bw() +
theme(aspect.ratio = 1)Next, compare the Ki67 interactions with histone modifications (H3K27me3 and H3K9me3) and NL interactions.
Note: I have not generated H3K9me3 signal in wildtype HCT116 cells. I will use the DMSO condition instead.
set.seed(999)
tib_cor_rpe <- CorrelationPerChromosome(tib_padamid_wildtype,
ki67 = "RPE_wt_Ki67",
h3k27me3 = "RPE_wt_H3K27me3",
h3k9me3 = "RPE_wt_H3K9me3",
lmnb1 = "RPE_wt_LMNB1")tib_cor_hct116 <- CorrelationPerChromosome(tib_padamid_wildtype,
ki67 = "HCT116_wt_Ki67",
h3k27me3 = "HCT116_wt_H3K27me3",
h3k9me3 = "HCT116_ActD_DMSO_H3K9me3",
lmnb1 = "HCT116_wt_LMNB1")tib_cor_k562 <- CorrelationPerChromosome(tib_padamid_wildtype,
ki67 = "K562_wt_Ki67",
h3k27me3 = "K562_wt_H3K27me3",
h3k9me3 = "K562_wt_H3K9me3",
lmnb1 = "K562_wt_LMNB1")# Make combined plot
bind_rows(tib_cor_rpe %>%
add_column(cell = "RPE"),
tib_cor_hct116 %>%
add_column(cell = "HCT116"),
tib_cor_k562 %>%
add_column(cell = "K562")) %>%
mutate(cell = factor(cell, levels(padamid_metadata_wildtype$cell))) %>%
ggplot(aes(x = seqnames, y = value, col = key)) +
geom_point() +
geom_hline(yintercept = 0, col = "black", linetype = "dashed") +
xlab("") +
ylab("Pearson") +
facet_grid(. ~ cell) +
scale_color_manual(values = RColorBrewer::brewer.pal(4,
"Set1")[c(3, 4, 2)]) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))# Also, some scatter plots
PlotScatter(tib_padamid_wildtype, "RPE_wt_Ki67", "RPE_wt_LMNB1")PlotScatter(tib_padamid_wildtype, "RPE_wt_Ki67", "RPE_wt_LMNB1",
color_by = tib_padamid_wildtype$RPE_wt_H3K27me3)PlotScatter(tib_padamid_wildtype, "RPE_wt_Ki67", "RPE_wt_LMNB1",
color_by = tib_padamid_wildtype$RPE_wt_H3K9me3)PlotScatter(tib_padamid_wildtype, "HCT116_wt_Ki67", "HCT116_wt_LMNB1")PlotScatter(tib_padamid_wildtype, "HCT116_wt_Ki67", "HCT116_wt_LMNB1",
color_by = tib_padamid_wildtype$HCT116_wt_H3K27me3)PlotScatter(tib_padamid_wildtype, "HCT116_wt_Ki67", "HCT116_wt_LMNB1",
color_by = tib_padamid_wildtype$HCT116_ActD_DMSO_H3K9me3)PlotScatter(tib_padamid_wildtype, "K562_wt_Ki67", "K562_wt_LMNB1")PlotScatter(tib_padamid_wildtype, "K562_wt_Ki67", "K562_wt_LMNB1",
color_by = tib_padamid_wildtype$K562_wt_H3K27me3)PlotScatter(tib_padamid_wildtype, "K562_wt_Ki67", "K562_wt_LMNB1",
color_by = tib_padamid_wildtype$K562_wt_H3K9me3)# Now the "binned" scatter plots
PlotScatterBinned(tib_padamid_wildtype, "RPE_wt_Ki67", "RPE_wt_LMNB1")## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
PlotScatterBinned(tib_padamid_wildtype, "RPE_wt_Ki67", "RPE_wt_LMNB1",
color_by = tib_padamid_wildtype$RPE_wt_H3K27me3,
ylimits_col = c(-2.4, 2.2))## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
PlotScatterBinned(tib_padamid_wildtype, "RPE_wt_Ki67", "RPE_wt_LMNB1",
color_by = tib_padamid_wildtype$RPE_wt_H3K9me3,
ylimits_col = c(-1.8, 1.8))## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
PlotScatterBinned(tib_padamid_wildtype, "HCT116_wt_Ki67", "HCT116_wt_LMNB1")## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
PlotScatterBinned(tib_padamid_wildtype, "HCT116_wt_Ki67", "HCT116_wt_LMNB1",
color_by = tib_padamid_wildtype$HCT116_wt_H3K27me3,
ylimits_col = c(-2.4, 2.2))## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
PlotScatterBinned(tib_padamid_wildtype, "HCT116_wt_Ki67", "HCT116_wt_LMNB1",
color_by = tib_padamid_wildtype$HCT116_ActD_DMSO_H3K9me3,
ylimits_col = c(-1.8, 1.8))## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
PlotScatterBinned(tib_padamid_wildtype, "K562_wt_Ki67", "K562_wt_LMNB1")## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
PlotScatterBinned(tib_padamid_wildtype, "K562_wt_Ki67", "K562_wt_LMNB1",
color_by = tib_padamid_wildtype$K562_wt_H3K27me3,
ylimits_col = c(-2.4, 2.2))## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
PlotScatterBinned(tib_padamid_wildtype, "K562_wt_Ki67", "K562_wt_LMNB1",
color_by = tib_padamid_wildtype$K562_wt_H3K9me3,
ylimits_col = c(-1.8, 1.8))## `summarise()` has grouped output by 'n1_bin'. You can override using the `.groups` argument.
# # Same plots, filtered for LADs
# RPE_lad_idx <- which(tib_hmm_combined$RPE_wt_LMNB1 == "AD")
# PlotScatter(tib_padamid_wildtype[RPE_lad_idx, ],
# "RPE_wt_Ki67", "RPE_wt_LMNB1")
# PlotScatter(tib_padamid_wildtype[RPE_lad_idx, ],
# "RPE_wt_Ki67", "RPE_wt_LMNB1",
# color_by = tib_padamid_wildtype$RPE_wt_H3K27me3[RPE_lad_idx])
# PlotScatter(tib_padamid_wildtype[RPE_lad_idx, ],
# "RPE_wt_Ki67", "RPE_wt_LMNB1",
# color_by = tib_padamid_wildtype$RPE_wt_H3K9me3[RPE_lad_idx])
#
# HCT116_lad_idx <- which(tib_hmm_combined$HCT116_wt_LMNB1 == "AD")
# PlotScatter(tib_padamid_wildtype[HCT116_lad_idx, ],
# "HCT116_wt_Ki67", "HCT116_wt_LMNB1")
# PlotScatter(tib_padamid_wildtype[HCT116_lad_idx, ],
# "HCT116_wt_Ki67", "HCT116_wt_LMNB1",
# color_by = tib_padamid_wildtype$HCT116_ActD_DMSO_H3K9me3[HCT116_lad_idx])
# PlotScatter(tib_padamid_wildtype[HCT116_lad_idx, ],
# "HCT116_wt_Ki67", "HCT116_wt_LMNB1",
# color_by = tib_padamid_wildtype$HCT116_wt_H3K9me3[HCT116_lad_idx])
#
# K562_lad_idx <- which(tib_hmm_combined$K562_wt_LMNB1 == "AD")
# PlotScatter(tib_padamid_wildtype[K562_lad_idx, ],
# "K562_wt_Ki67", "K562_wt_LMNB1")
# PlotScatter(tib_padamid_wildtype[K562_lad_idx, ],
# "K562_wt_Ki67", "K562_wt_LMNB1",
# color_by = tib_padamid_wildtype$K562_wt_H3K27me3[K562_lad_idx])
# PlotScatter(tib_padamid_wildtype[K562_lad_idx, ],
# "K562_wt_Ki67", "K562_wt_LMNB1",
# color_by = tib_padamid_wildtype$K562_wt_H3K9me3[K562_lad_idx])Next, show that the balance between lamina and nucleolus is anti-correlated between the chromosomes.
# Calculate "max" score per chromosome
tib <- tib_padamid_wildtype %>%
dplyr::select(seqnames, contains("Ki67"), contains("LMNB1"),
-matches("NOV|HPA")) %>%
gather(key, value, -seqnames) %>%
mutate(idx = match(key, padamid_metadata_combined$sample),
sample = padamid_metadata_combined$sample[idx],
cell = padamid_metadata_combined$cell[idx],
target = padamid_metadata_combined$target[idx]) %>%
group_by(cell, target, seqnames) %>%
dplyr::summarise(q90 = quantile(value, 0.95, na.rm = T)) %>%
spread(target, q90)## `summarise()` has grouped output by 'cell', 'target'. You can override using the `.groups` argument.
# Plot simple scatterplot
tib %>%
ggplot(aes(x = Ki67, y = LMNB1, col = cell, shape = cell)) +
geom_point() +
geom_text(aes(y = LMNB1 + 0.1, label = seqnames),
col = "black", size = 3) +
geom_smooth(method = "lm") +
facet_grid(cell ~ .) +
scale_color_brewer(palette = "Set2") +
theme_bw() +
theme(aspect.ratio = 1)## `geom_smooth()` using formula 'y ~ x'
tib %>%
ggplot(aes(x = Ki67, y = LMNB1, col = cell, shape = cell)) +
geom_point() +
geom_smooth(method = "lm") +
scale_color_brewer(palette = "Set2") +
theme_bw() +
theme(aspect.ratio = 1)## `geom_smooth()` using formula 'y ~ x'
Sevaral conclusions:
sessionInfo()## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.32 corrr_0.4.3 GGally_2.1.1
## [4] RColorBrewer_1.1-2 rtracklayer_1.50.0 GenomicRanges_1.42.0
## [7] GenomeInfoDb_1.26.7 IRanges_2.24.1 S4Vectors_0.28.1
## [10] BiocGenerics_0.36.1 forcats_0.5.1 stringr_1.4.0
## [13] dplyr_1.0.5 purrr_0.3.4 readr_1.4.0
## [16] tidyr_1.1.3 tibble_3.1.1 ggplot2_3.3.3
## [19] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-152 bitops_1.0-6
## [3] matrixStats_0.58.0 fs_1.5.0
## [5] lubridate_1.7.10 httr_1.4.2
## [7] tools_4.0.5 backports_1.2.1
## [9] bslib_0.2.4 utf8_1.2.1
## [11] R6_2.5.0 mgcv_1.8-35
## [13] DBI_1.1.1 colorspace_2.0-0
## [15] withr_2.4.2 tidyselect_1.1.0
## [17] compiler_4.0.5 cli_2.4.0
## [19] rvest_1.0.0 Biobase_2.50.0
## [21] xml2_1.3.2 DelayedArray_0.16.3
## [23] labeling_0.4.2 sass_0.3.1
## [25] scales_1.1.1 digest_0.6.27
## [27] Rsamtools_2.6.0 rmarkdown_2.7
## [29] XVector_0.30.0 pkgconfig_2.0.3
## [31] htmltools_0.5.1.1 MatrixGenerics_1.2.1
## [33] highr_0.9 dbplyr_2.1.1
## [35] rlang_0.4.10 readxl_1.3.1
## [37] rstudioapi_0.13 farver_2.1.0
## [39] jquerylib_0.1.3 generics_0.1.0
## [41] jsonlite_1.7.2 BiocParallel_1.24.1
## [43] RCurl_1.98-1.3 magrittr_2.0.1
## [45] GenomeInfoDbData_1.2.4 Matrix_1.3-2
## [47] Rcpp_1.0.6 munsell_0.5.0
## [49] fansi_0.4.2 lifecycle_1.0.0
## [51] stringi_1.5.3 yaml_2.2.1
## [53] SummarizedExperiment_1.20.0 zlibbioc_1.36.0
## [55] plyr_1.8.6 grid_4.0.5
## [57] crayon_1.4.1 lattice_0.20-41
## [59] splines_4.0.5 Biostrings_2.58.0
## [61] haven_2.4.0 hms_1.0.0
## [63] pillar_1.6.0 codetools_0.2-18
## [65] reprex_2.0.0 XML_3.99-0.6
## [67] glue_1.4.2 evaluate_0.14
## [69] modelr_0.1.8 vctrs_0.3.7
## [71] cellranger_1.1.0 gtable_0.3.0
## [73] reshape_0.8.8 assertthat_0.2.1
## [75] xfun_0.22 broom_0.7.6
## [77] GenomicAlignments_1.26.0 ellipsis_0.3.1